<?php
/**
 * A string token plus the set of next-shorter substrings ("sub-tokens") it contains.
 *
 * Instances are normally built through Token::create(), which fills in the
 * sub-token set. The occurrence counter is left at 0 for the owner to maintain.
 */
class Token{

    /** @var int Occurrence counter; starts at 0 and is updated by the code that owns the token. */
    public $count = 0;

    /** @var string|null The token text. */
    public $key = null;

    /** @var array Map of sub-token text => number of times it was added. */
    private $subTokens = array();

    /** @var string|null Token kind; could be "object" or "category". */
    public $type = null;

    /**
     * Build a token and register its sub-tokens.
     *
     * When the key is longer than $MIN_LEN characters, every substring that is
     * one character shorter than the key is registered as a sub-token. Keys
     * longer than $MAX_LEN are treated as if they were $MAX_LEN + 1 characters
     * long, so their sub-tokens are exactly $MAX_LEN characters long.
     *
     * @param string $key     Token text (measured as UTF-8).
     * @param string $type    Token kind, "object" or "category".
     * @param int    $MIN_LEN Keys of this length or shorter get no sub-tokens.
     * @param int    $MAX_LEN Upper bound on sub-token length.
     * @return Token
     */
    public static function create($key, $type="object", $MIN_LEN=2, $MAX_LEN=6){
        $token = new Token;
        $token->key = $key;
        $token->type = $type;

        $len = mb_strlen($key, "UTF-8");
        if($len > $MIN_LEN){
            // One character shorter than the key, capped at $MAX_LEN.
            $subLen = min($len, $MAX_LEN + 1) - 1;
            foreach(self::extractTokenStrs($key, $subLen, $subLen) as $str){
                $token->addSubToken($str);
            }
        }

        return $token;
    }

    /**
     * Register one occurrence of a sub-token.
     *
     * @param string $key Sub-token text.
     * @return void
     */
    public function addSubToken($key){
        if(isset($this->subTokens[$key])){
            $this->subTokens[$key]++;
        }
        else{
            $this->subTokens[$key] = 1;
        }
    }

    /**
     * List the distinct sub-tokens in the order they were first added.
     *
     * @return string[]
     */
    public function getSubTokens(){
        // PHP converts numeric-string array keys such as "123" to integers;
        // cast them back so callers always receive strings.
        return array_map('strval', array_keys($this->subTokens));
    }

    /**
     * Enumerate every substring of $string whose length lies in [$MIN_LEN, $MAX_LEN].
     *
     * Longer substrings come first; within one length, substrings are ordered by
     * start position. Duplicates are kept. Lengths are counted in UTF-8
     * characters. A $MIN_LEN below 1 is treated as 1 so that no empty strings
     * are produced.
     *
     * @param string $string  Text to slice.
     * @param int    $MIN_LEN Shortest substring length to emit.
     * @param int    $MAX_LEN Longest substring length to emit.
     * @return string[] Empty when $MAX_LEN < $MIN_LEN or the text is too short.
     */
    public static function extractTokenStrs($string, $MIN_LEN=2, $MAX_LEN=6){
        $tokenStrs = array();
        $len = mb_strlen($string, "UTF-8");

        // Zero-length slices are never useful tokens.
        $shortest = max(1, $MIN_LEN);
        // Substrings longer than the text cannot exist, so start at the text length at most.
        $longest = min($MAX_LEN, $len);

        for($k = $longest; $k >= $shortest; $k--){
            for($j = 0; $j + $k <= $len; $j++){
                $tokenStrs[] = mb_substr($string, $j, $k, "UTF-8");
            }
        }

        return $tokenStrs;
    }

}
 
/**
 * Extracts frequently repeated character sequences (2 to 6 characters) from text.
 *
 * Usage: feed text with addTokens(), then call getKeywords().
 *
 * - "Categories" are the text fragments left after splitting the input on the
 *   hard-coded delimiters in addTokens(). Each distinct fragment is stored once.
 * - "Keywords" are all 2..6-character substrings of the categories, each with a
 *   counter of how often it occurs across the stored categories.
 */
class Cloud{

    /** @var array Map of substring => Token (type "object") carrying the occurrence count. */
    private $keywords = array();

    /** @var array Map of fragment text => Token (type "category"). */
    private $categories = array();

    /**
     * Count one occurrence of a keyword candidate, creating its token on first sight.
     *
     * @param string $string Candidate text.
     * @return Token|false The token, or false when $string is empty().
     */
    private function addToken($string){
        if(empty($string)){
            return false;
        }

        if(!isset($this->keywords[$string])){
            $this->keywords[$string] = Token::create($string, "object");
        }
        $this->keywords[$string]->count++;

        return $this->keywords[$string];
    }

    /**
     * Register a category fragment (stored once per distinct text; not counted).
     *
     * @param string $string Fragment text.
     * @return Token|false The category token, or false when $string is empty().
     */
    private function addCategory($string){
        if(empty($string)){
            return false;
        }

        if(!isset($this->categories[$string])){
            $this->categories[$string] = Token::create($string, "category");
        }

        return $this->categories[$string];
    }

    /** Drop all categories. */
    private function emptyCategories(){
        $this->categories = array();
    }

    /**
     * Split raw text into fragments and register each fragment of two or more
     * characters as a category.
     *
     * The text is cut on three hard-coded words, on runs of Unicode
     * punctuation/symbol/separator/control/mark characters, and on runs of
     * ASCII letters and digits.
     *
     * @param string $string UTF-8 text.
     * @return void
     */
    public function addTokens($string){
        $strs = self::split($string, array("的","了","然后","[\pP\pS\pZ\pC\pM]+","[0-9a-zA-Z]+"));
        if(!is_array($strs)){
            // preg_split() failed (e.g. the input is not valid UTF-8): nothing to add.
            return;
        }

        foreach($strs as $str){
            if(mb_strlen($str, "UTF-8") >= 2){
                $this->addCategory($str);
            }
        }
    }

    /** Drop all keyword candidates. */
    private function emptyKeywordsList(){
        $this->keywords = array();
    }

    /**
     * Rebuild the keyword table from scratch: every 2..6-character substring of
     * every category is counted once per occurrence.
     *
     * @return void
     */
    public function buildKeyWordsList(){
        $this->emptyKeywordsList();
        foreach($this->categories as $c){
            foreach(Token::extractTokenStrs($c->key, 2, 6) as $s){
                $this->addToken($s);
            }
        }
    }

    /**
     * Return the distinct entries of $array that have the greatest UTF-8 length.
     *
     * @param array $array Strings to compare.
     * @return array The longest entries in first-seen order; empty for empty input.
     */
    public static function longest_string_in_array($array) {
        if(count($array) === 0){
            // max() cannot be applied to an empty list.
            return array();
        }

        $mapping = array();
        foreach($array as $s){
            $mapping[$s] = mb_strlen($s, "UTF-8");
        }

        return array_keys($mapping, max($mapping));
    }

    /**
     * Find the longest keyword candidates whose count reaches $threshold.
     *
     * @param int $threshold Minimum occurrence count.
     * @return array|false Map of keyword text => Token, or false when nothing qualifies.
     */
    public function findBestMatchingWords($threshold){
        $keywords = array();
        foreach($this->categories as $k){
            $this->extractCategoryKeywords($k, $threshold, $keywords);
        }

        if(count($keywords) === 0){
            return false;
        }

        $ret = array();
        foreach(self::longest_string_in_array(array_keys($keywords)) as $k){
            $ret[$k] = $keywords[$k];
        }

        return $ret;
    }

    /**
     * Subtract a chosen keyword's count from every 2..6-character substring of
     * it (itself included), so those occurrences are not counted again.
     *
     * @param Token $token The keyword that was just selected.
     * @return void
     */
    private function removeInfluence(Token $token){
        // Read the count first: the token itself is among the entries decremented below.
        $num = $token->count;

        foreach(Token::extractTokenStrs($token->key, 2, 6) as $s){
            if(isset($this->keywords[$s])){
                $this->keywords[$s]->count -= $num;
            }
        }
    }

    /**
     * Extract keywords from the text added so far.
     *
     * Repeatedly picks the longest candidates whose count is at least
     * $threshold, removes their influence from the counts, cuts the categories
     * apart at those keywords and searches again until nothing qualifies. The
     * categories are restored afterwards, so the method can be called again.
     *
     * @param int $threshold Minimum occurrence count for a keyword.
     * @return array Map of keyword text => Token. Each Token is a snapshot taken
     *               when the keyword was selected, so its count is the
     *               qualifying count.
     */
    public function getKeywords($threshold = 3){
        $this->buildKeyWordsList();

        $stopList = array();
        $keywords = array();
        $categoriesToRestore = $this->categories;

        while(true){
            $bestWords = $this->findBestMatchingWords($threshold);
            if($bestWords === false){
                break;
            }

            foreach($bestWords as $w){
                // Snapshot before removeInfluence() decrements the shared token.
                $keywords[$w->key] = clone $w;
                $this->removeInfluence($w);
                // split() takes regex fragments, so literal keywords must be escaped.
                $stopList[] = preg_quote($w->key, "/");
            }

            // Use the stop list to rebuild the categories.
            $oldCategories = $this->categories;
            $this->emptyCategories();
            foreach($oldCategories as $c){
                $list = self::split($c->key, $stopList);
                if(!is_array($list)){
                    continue;
                }
                foreach($list as $nc){
                    // Same minimum as addTokens(): shorter pieces cannot contain a keyword.
                    if(mb_strlen($nc, "UTF-8") >= 2){
                        $this->addCategory($nc);
                    }
                }
            }
        }

        $this->categories = $categoriesToRestore;

        return $keywords;
    }

    /**
     * Split a string on any of the given delimiters.
     *
     * Each delimiter is a regular-expression fragment (not a literal); the
     * fragments are joined with "|" and applied case-insensitively in UTF-8
     * mode with "/" as the pattern delimiter. Escape literal text with
     * preg_quote($text, "/") before passing it in.
     *
     * @param string $string     Text to split.
     * @param array  $delimiters Regex fragments; array keys are ignored.
     * @return array|false The pieces (possibly including empty strings), the
     *                     whole string as a single piece when there are no
     *                     delimiters, or false when preg_split() fails.
     */
    public static function split($string, $delimiters){
        if(count($delimiters) === 0){
            // An empty pattern would split between every character.
            return array($string);
        }

        $pattern = "/".implode("|", $delimiters)."/isu";

        return preg_split($pattern, $string);
    }

    /**
     * Collect, into $keywords, the keyword candidates reachable from $root whose
     * count is at least $threshold.
     *
     * If $root itself qualifies it is recorded and the search stops there;
     * otherwise the search descends into $root's sub-tokens.
     *
     * @param Token      $root      Category or keyword token to inspect.
     * @param int        $threshold Minimum occurrence count.
     * @param array|null $keywords  Accumulator (by reference), keyword text => Token.
     * @return void
     */
    public function extractCategoryKeywords($root, $threshold, &$keywords=null){
        if($keywords === null){
            $keywords = array();
        }

        $key = $root->key;

        if(isset($keywords[$key])){
            return;
        }

        if(isset($this->keywords[$key]) && $this->keywords[$key]->count >= $threshold){
            $keywords[$key] = $this->keywords[$key];
            return;
        }

        foreach($root->getSubTokens() as $s){
            // The keyword table may not contain the sub-token (e.g. when it has
            // not been built yet); there is nothing to descend into then.
            if(isset($this->keywords[$s])){
                $this->extractCategoryKeywords($this->keywords[$s], $threshold, $keywords);
            }
        }
    }

}// Cloud ends
 
// Demo driver: runs the keyword extraction on a small sample and prints the result.

// Raise the script execution time limit to 1000 seconds.
set_time_limit ( 1000 );
// Sample input: two HTML-meta-like lines containing ASCII text and a bracketed CJK phrase.
$string = <<<EOT
meta http-equiv="Content-Type" content="text/html; charset=UTF-8"
meta name="description" content="[回到部落格首頁]"
EOT;
$cloud = new Cloud();
$cloud->addTokens($string);
// Print only the keyword texts (the array keys) found with a threshold of 3.
print_r(array_keys($cloud->getKeywords(3)));
// Trailing marker output after the keyword dump.
echo "hello";